function [sample_cross] = generate_cross_sample_baseline_fullsample(sample,wave,winter_prevuse,pre_avg_n,individual_ps,post_avg_n)

% This function generates the cross-sectional sample for EWM analysis using
% measures of baseline consumption. 

% Input: 
% (1) sample: panel data of electricity consumption households over time
% and covariates (cross-sectional). 
% (2) wave: =0 if pooled sample, or 3, 6, 7 for wave-specific results
% (3) winter_prevuse: indicates whether baseline consumption is calculated 
% as the mean of consumption in winter months (Jan and Feb) or as the mean
% of specified pre-treatment periods
% (4) pre_avg_n: number of pre-treatment periods 
% (5) individual_ps:  % =1 if pre-treatment average calculated for Jan and Feb 
% each year; =0 if pre-treatment average calculated using specified number
% (6) post_avg_n: number of post-treatment periods 


% Output: 
% (1) sample_cross_baseline: collapse panel into cross-section for
% covariates,and statistics of baseline consumption (min, max, std). 

%% Import panel dataset
% import panel dataset for baseline statistics construction
pre_avg_string=sprintf('pre_avg_%1.0f',pre_avg_n); % generate baseline consumption variable name 
    
missing= (isnan(sample.(pre_avg_string))) | (isnan(sample.post_avg));
sample_clean_panel=sample(~missing,:);
zero= (sample_clean_panel.(pre_avg_string)==0) | (sample_clean_panel.post_avg==0);

size_nan=isnan(sample_clean_panel.uni_size);
vintage_nan=isnan(sample_clean_panel.vintage);
sample_clean_panel.uni_size((sample_clean_panel.uni_size>5000 & size_nan==0),:)=5000;
sample_clean_panel.vintage((sample_clean_panel.vintage<1850 & vintage_nan==0),:)=1850;
sample_panel=sample_clean_panel(~zero,:);

%% Define panel based on post_avg_n
if post_avg_n==0
    usage_post=array2table(sample_panel{sample_panel.post_opower==1,{'ky_ba','tot_kwh'}});
    opower=array2table(sample_panel{sample_panel.post_opower==1,{'ky_ba','opower'}});
else
    usage_post=array2table(sample_panel{sample_panel.month_since_opower<post_avg_n & sample_panel.month_since_opower>=0,{'ky_ba','tot_kwh'}});
    opower=array2table(sample_panel{sample_panel.month_since_opower<post_avg_n & sample_panel.month_since_opower>=0,{'ky_ba','opower'}});
end
usage_post.Properties.VariableNames={'ky_ba','tot_kwh'};
opower.Properties.VariableNames={'ky_ba','opower'};
usage_post_mean=grpstats(usage_post,'ky_ba','mean','DataVars','tot_kwh');
post_avg=usage_post_mean.mean_tot_kwh;
opower_mean=grpstats(opower,'ky_ba','mean','DataVars','opower');
opower=opower_mean.mean_opower;

% This line redefines the panel to exclude accounts that have missing
% post_avg (depending on the number of post period selected)
sample_panel=sample_panel(ismember(sample_panel.ky_ba,usage_post.ky_ba),:);

%% Collapse panel dataset into cross-section for covariates
G=findgroups(sample_panel.ky_ba);
income=splitapply(@mean,sample_panel.income,G)/1000;
size=splitapply(@mean,sample_panel.uni_size,G);
vintage=splitapply(@mean,sample_panel.vintage,G);
building_size = splitapply(@mean,sample_panel.building_size,G);
member = splitapply(@mean,sample_panel.member,G);
ky_ba=splitapply(@mean,sample_panel.ky_ba,G);
opower_paper_wave=splitapply(@mean,sample_panel.opower_paper_wave,G);

%% Collapse panel dataset into cross-section for average pre-treatment consumption 
% Depending on choice of baseline period 
if (winter_prevuse)
    pre_avg=round(splitapply(@mean,sample_panel.pre_avg_winter_recent,G));
    pre_avg(isnan(pre_avg))=0;
else
    pre_avg=round(splitapply(@mean,sample_panel.(pre_avg_string),G));
end
        
%% Calculate statistics of baseline consumption
baseline=array2table(sample_panel{sample_panel.post_opower==0,{'ky_ba','tot_kwh'}});
baseline.Properties.VariableNames={'ky_ba','tot_kwh'};
min_baseline=grpstats(baseline,'ky_ba','min','DataVars','tot_kwh');
min_baseline=min_baseline.min_tot_kwh;
%
baseline=array2table(sample_panel{sample_panel.post_opower==0,{'ky_ba','tot_kwh'}});
baseline.Properties.VariableNames={'ky_ba','tot_kwh'};
max_baseline=grpstats(baseline,'ky_ba','max','DataVars','tot_kwh');
max_baseline=max_baseline.max_tot_kwh;
%
baseline=array2table(sample_panel{sample_panel.post_opower==0,{'ky_ba','tot_kwh'}});
baseline.Properties.VariableNames={'ky_ba','tot_kwh'};
std_baseline=grpstats(baseline,'ky_ba','std','DataVars','tot_kwh');
std_baseline=std_baseline.std_tot_kwh;

%% Choose propensity score
% get estimated propensity score from a separate file as needed
ps_wave3=sum(opower(opower_paper_wave==3))/sum(opower_paper_wave==3);
ps_wave6=sum(opower(opower_paper_wave==6))/sum(opower_paper_wave==6);
ps_wave7=sum(opower(opower_paper_wave==7))/sum(opower_paper_wave==7);
e=zeros(length(opower_paper_wave),1);
e(opower_paper_wave==3)=ps_wave3;
e(opower_paper_wave==6)=ps_wave6;
e(opower_paper_wave==7)=ps_wave7;
ps=e;

%% Define variables for export
    sample_cross=[ky_ba opower opower_paper_wave post_avg pre_avg income ...
        size vintage building_size member ps min_baseline max_baseline std_baseline];
    sample_cross=array2table(sample_cross,...
    'VariableNames',{'ky_ba','opower','opower_paper_wave','post_avg','pre_avg','income','uni_size','vintage','building_size','member'...
    'ps','min_baseline','max_baseline','std_baseline'});

% parse dataset based on wave selection
switch wave 
    case 0
    case 3 
        sample_cross=sample_cross(sample_cross.opower_paper_wave==3,:);
    case 6
        sample_cross=sample_cross(sample_cross.opower_paper_wave==6,:);
    case 7
        sample_cross=sample_cross(sample_cross.opower_paper_wave==7,:);
end 
end